################################
#	John Henderson
#	Gerrymandering Incumbency 
#		(with Brian Hamel and Aaron Goldzimer)
#		January 1, 2018
#
################################
# AD t.test results function
################################

rm(list=ls())
library(stringr)    
library(foreign) 
library(mice)  
library(matrixStats)

setwd('~/Dropbox/StateRedistricting/replication/short')

state=chamber=NULL
flips=NULL   

imputeNAplans=function(dists,plans=NULL){
	# imputing missing cells for partial plans
	# can't just align districts by number, since these have geographical 
	# definition; and don't have geographical boundaries for non-districts
	# in shapefiles, etc. 
	
	# hard and fast rule: sort districts on the pr(Obama 08) for whole plans, 
	# then match existing districts in partial plans to those that reduce the k-plan
	# distances
	if(is.null(plans)){
		warning('No index for plans is included: care is required to map back onto the various alternatives.')
	}   
	# length of datas
	m=length(dists)
	n=nrow(dists[[1]])  
	# length of completes
	cm=0
	#mu=array(NA,m)  
	is.cm=array(FALSE,m)  
	ord.dists=list() 
	ord.mat=matrix(NA,n,length(dists))
	for(i in 1:m){
		if(all(complete.cases(dists[[i]]))){
			cm=cm+1
			is.cm[i]=T
			mu=dists[[i]][,1]/(dists[[i]][,1]+dists[[i]][,2])
			ord.dists[[i]]=dists[[i]][order(mu),]
			
			ord.mat[,i]=t(t(sort(mu)))
		}
	}   
	
	for(i in 1:m){  
		if(is.cm[i]==F){
			m0=sort(dists[[i]][,1]/(dists[[i]][,1]+dists[[i]][,2]),na.last=T)
			n0=as.numeric(names(m0))
			n1=array(NA,n)
			for(p in 1:length(m0)){ 
				if(is.na(m0[p])){
					break()
				}
				dd=sqrt(rowSums((ord.mat-m0[p])^2,na.rm=T)) 
				if(length(which(!is.na(n1)))>0){
					dd[1:max(which(!is.na(n1)))]=NA
				}
				n1[which(dd==min(dd,na.rm=T))]=n0[p]
				
				#if dd object is approaching length(m0) *too fast*
				if(p<length(m0[!is.na(m0)]) & max(which(is.na(n1)))<length(m0)){
					n1=c(n1[-c(max(which(is.na(n1))))],NA)
				} 			      			
			}
			n1[which(is.na(n1))]=sample(n0[which(is.na(m0))],replace=F)	
			ord.dists[[i]]=dists[[i]][n1,]		   		
		}
	}
	 
	# quasi-randomization simulation : plans that have missing districts
	#  will have missings imputed for *all* the other plans that have those 
	#  districts : average district bias will be recorded ... 
	# complete plans are analyzed as is
	dist_insa=matrix(NA,n,1)
	dist_ins=matrix(NA,n,m) 
	colnames(dist_ins)=plans
	for(i in 1:m){  
		if(is.cm[i]==F){
			qn=ord.dists[[i]][,1]/(ord.dists[[i]][,1]+ord.dists[[i]][,2])-ord.dists[[i]][,2]/(ord.dists[[i]][,1]+ord.dists[[i]][,2])			
		  	mat_temp=matrix(qn,n,sum(is.cm))
			jj=0
			for(j in which(is.cm)){
				jj=jj+1
				qs=ord.dists[[j]][,1]/(ord.dists[[j]][,1]+ord.dists[[j]][,2])-ord.dists[[j]][,2]/(ord.dists[[j]][,1]+ord.dists[[j]][,2])							
				mat_temp[which(is.na(qn)),jj]=qs[which(is.na(qn))]				
			}                                                     
			colnames(mat_temp)=paste(plans[i],'sim',1:jj,sep='_')
			if(mean(complete.cases(dist_insa))==0){
				dist_insa=mat_temp
			} else {
				dist_insa=cbind(dist_insa,mat_temp)
			}
		} else {
			dist_ins[,i]=ord.dists[[i]][,1]/(ord.dists[[i]][,1]+ord.dists[[i]][,2])-ord.dists[[i]][,2]/(ord.dists[[i]][,1]+ord.dists[[i]][,2])			
		}
	} 
	if(mean(is.cm)<1){
		dist_ins=cbind(dist_ins[,which(is.cm==T)],dist_insa)
	}
	return(dist_ins) 
}
 
############################
#
# counterfactual maps 2010
#
############################
                          
states=c(
  "AL","AK","AZ","AR","CA",  "CO","CT","DE","FL",  "GA","HI","ID","IL","IN",  "IA","KS","KY","LA","ME",  "MT","NE","NV","NH","NJ", "OR",  
  "NM","NY","NC","ND","OH",  "OK","MD","MA","MI",  "MN","MS","MO","PA","RI",  "SC","SD","TN","TX","UT",  "VT","VA","WA","WV","WI", "WY")

states=sort(states)
     
plans=read.csv('redist_authority.csv',stringsAsFactors=F)    
plans10=plans[which(plans[,1]==2010),]
included=array(FALSE,nrow(plans10))
for(i in 1:length(states)){
	included[which(states[i]==plans10[,2])]=TRUE
}

plans10=plans10[included,]
plans10=plans10[order(plans10[,2]),]

plansCD10 = array(NA,length(states))
plansCD10[which(plans10$congress_court==1)]='C'
plansCD10[which(plans10$congress_independent==1)]='I'
plansCD10[which(plans10$congress_politician==1)]='B'
plansCD10[which(is.na(plansCD10) & plans10$congress_legislatue_partycontrol=='nonpartisan')]='B' # NE
plansCD10[which(is.na(plansCD10) & plans10$congress_legislatue_partycontrol=='split')]='B'
plansCD10[which(is.na(plansCD10) & plans10$congress_legislatue_partycontrol=='D')]='D'
plansCD10[which(is.na(plansCD10) & plans10$congress_legislatue_partycontrol=='R')]='R'
plansCD10[which(is.na(plansCD10))]='N'

plansAD10 = array(NA,length(states))
plansAD10[which(plans10$state_court==1)]='C'
plansAD10[which(plans10$state_independent==1)]='I'
plansAD10[which(plans10$state_politician==1)]='B'
plansAD10[which(is.na(plansAD10) & plans10$state_legislature_partycontrol=='nonpartisan')]='B' # NE
plansAD10[which(is.na(plansAD10) & plans10$state_legislature_partycontrol=='split')]='B'
plansAD10[which(is.na(plansAD10) & plans10$state_legislature_partycontrol=='D')]='D'
plansAD10[which(is.na(plansAD10) & plans10$state_legislature_partycontrol=='R')]='R'
plansAD10[which(is.na(plansAD10))]='N'
 
plansSD10=plansAD10
 
#############################
#
# ad
#
#############################
                                                              
#############################  
## Florida has multiple plans and so need to adjust for the court map in 2015/2016
#############################
#folders=paste('AD_plans/',system('ls ~/Dropbox/StateRedistricting/Data/plans2010/FL/AD/AD_plans/',intern=T),sep='')
#full_plans=read.csv('~/Dropbox/StateRedistricting/Data/plans2010/FL/scraped/PlanDataIns.csv',header=F,stringsAsFactors=F)
#dates=array(NA,length(folders))
#folders=gsub(folders,pattern='AD_plans/',replace='') 
#for(i in 2:length(folders)){
#	ix=which(gsub(folders[i],pattern='_adopted',replace='')==full_plans[,1])
#	dates[i]=str_sub(full_plans[ix,3],7)
#}
#dates[1]='2002'  
#save(dates,folders,file='~/Dropbox/StateRedistricting/replication/short/AD_FL_dates.Rdata')
load('plans2010/AD_FL_dates.Rdata') 
#############################
      
states_cf=c('AK','AZ','CA','CO','FL','ID','MT','NC','NM','NV','OH','SC','TX','VA','WA')    
ix=c(which(states=='AK'),
	which(states=='AZ'),
	which(states=='CA'),
	which(states=='CO'),	
	which(states=='FL'),
	which(states=='ID'),	
	which(states=='MT'),
	which(states=='NC'),	 
	which(states=='NM'),
	which(states=='NV'),	 	
	which(states=='OH'),	
	which(states=='SC'),
	which(states=='TX'),
	which(states=='VA'),
	which(states=='WA'))
# 


plansAD10=plansAD10[ix] 
plansAD10[which(states_cf=='AK')]='I'
plansAD10[which(states_cf=='MT')]='I'
plansAD10[which(states_cf=='ID')]='I'
plansAD10[which(states_cf=='FL')]='R' 
plansAD10[which(states_cf=='OH')]='R'
         
# state-level OH politician committee looks partisan to me... 
states_ix=states_cf[c(which(plansAD10=='D'),which(plansAD10=='R'),which(plansAD10=='B'),which(plansAD10=='C'),which(plansAD10=='I'))]     
pl10=plansAD10[c(which(plansAD10=='D'),which(plansAD10=='R'),which(plansAD10=='B'),which(plansAD10=='C'),which(plansAD10=='I'))]     
# note: no state-level simulations 

######################
# flips to do here
###################### 

load('voteData.Rdata')
load('stateSwings.Rdata')

iq=0
cnts=length(states_ix)+1 

propGT=matrix(NA,length(states_ix),4)   

kk=c()         
ss=c()
ii=c()  

source('analysisSTFun.R') 

for (state in states_ix){  
	iq=iq+1;
	cnts=cnts-1
    chamber='AD'

	st=analysisST(state=state,chamber=chamber,flips=flips,m0=m0,m1=1000000)	
	k_get=st$k_get
	adopted=st$adopted 
	app=ap=st$ap
	opp=op=st$op	   

	# remove those with extra but simular adopted plans
	if(length(grep(names(k_get),pattern='adopted'))>1){
		names(k_get)[grep(names(k_get),pattern='adopted')[-c(1)]]=gsub(names(k_get)[grep(names(k_get),pattern='adopted')[-c(1)]],pattern='adopted_',replace='')
		adopted=adopted[1]
	}   
	
	k=ncol(op)
	m=nrow(op)
	k_get=array(NA,k)	
	
	if(length(dim(ap)[2])==0){
        for(j in 1:k){
			k_get[j]=mean(abs(ap)>abs(op[,j]),na.rm=T)
		}
	} else {
		for(j in 1:k){
			k_get[j]=mean(abs(rowMeans(ap,na.rm=T))>abs(op[,j]),na.rm=T)
		}
	}       
	
	kk=c(kk,k_get)
	ss=c(ss,rep(length(k_get),length(k_get))) 
	ii=c(ii,rep(pl10[iq],length(k_get))) 
	
	propGT[iq,1]=mean(k_get,na.rm=T)
	 
	st=analysisST(state=state,chamber=chamber,flips=flips,m0=m0,m1=1000000)	
	k_get=st$k_get
	adopted=st$adopted 
	ap=st$ap
	op=st$op   
    # adopted & k_get are the relevant objects  
    #names(k_get)[adopted]=paste('adopted',names(k_get)[adopted],sep='_')
		
	# remove those with extra but simular adopted plans
	if(length(grep(names(k_get),pattern='adopted'))>1){
		names(k_get)[grep(names(k_get),pattern='adopted')[-c(1)]]=gsub(names(k_get)[grep(names(k_get),pattern='adopted')[-c(1)]],pattern='adopted_',replace='')
		adopted=adopted[1]
	}
	
	#m=length(adopted)
	avg.sims=2
	if(avg.sims==1){ # average over partial plans 
		kk_get=avgSims(k_get,discard=F)
	} else if(avg.sims==2){ # retain all partial plans
		kk_get=k_get
	} else if(avg.sims==3){ # discard all partial plans 
		kk_get=avgSims(k_get,discard=T)		
	}
	sts=state
	propGT[iq,2]=mean(kk_get[c(grep(names(kk_get),pattern='adopted'))]>kk_get[-c(grep(names(kk_get),pattern='adopted'))])
	
	propGT[iq,3]=mean(abs(app))-mean(abs(opp))
	propGT[iq,4]=mean(mean(abs(app))>(colMeans(abs(opp))))
}
  
t.test(propGT[pl10=='I',4],c(propGT[pl10=='B',4],propGT[pl10=='D',4],propGT[pl10=='R',4]))    
#
#	Welch Two Sample t-test
#
#data:  propGT[pl10 == "I", 4] and c(propGT[pl10 == "B", 4], propGT[pl10 == "D", 4], propGT[pl10 == propGT[pl10 == "I", 4] and     "R", 4])
#t = -1.7497, df = 9.994, p-value = 0.1108
#alternative hypothesis: true difference in means is not equal to 0
#95 percent confidence interval:
# -0.51476793  0.06193535
#sample estimates:
#mean of x mean of y 
#0.5589028 0.7853191
#propGT=(cbind(pl10,propGT))
 
#ii=as.factor(ii)
#ii_mat=matrix(0,length(ii),3)
#ii_mat[which(ii=='B'),1]=1
#ii_mat[which(ii=='C'),2]=1
#ii_mat[which(ii=='R'),3]=1 
#colnames(ii_mat)=c('B','C','R')
#summary(lm(y~ss+ii_mat))
         
#t.test(propGT[pl10=='I',1],c(propGT[pl10=='B',1],propGT[pl10=='D',1],propGT[pl10=='R',1]))
#t.test(propGT[pl10=='I',2],c(propGT[pl10=='B',2],propGT[pl10=='D',2],propGT[pl10=='R',2]))                                                                                          
#t.test(c(propGT[pl10=='I',1],propGT[pl10=='I',2]),c(propGT[pl10=='B',1],propGT[pl10=='D',1],propGT[pl10=='R',1],propGT[pl10=='B',2],propGT[pl10=='D',2],propGT[pl10=='R',2]))
#0.4994018 0.6568372; p-value = 0.1365                                                                                           

#t.test(propGT[pl10=='C',1],c(propGT[pl10=='B',1],propGT[pl10=='D',1],propGT[pl10=='R',1]))
#t.test(propGT[pl10=='C',2],c(propGT[pl10=='B',2],propGT[pl10=='D',2],propGT[pl10=='R',2]))
#t.test(c(propGT[pl10=='C',1],propGT[pl10=='C',2]),c(propGT[pl10=='B',1],propGT[pl10=='D',1],propGT[pl10=='R',1],propGT[pl10=='B',2],propGT[pl10=='D',2],propGT[pl10=='R',2]))       
 
#t.test(propGT[pl10=='I',1],c(propGT[pl10=='C',1]))
#t.test(propGT[pl10=='I',2],c(propGT[pl10=='C',2]))
#t.test(c(propGT[pl10=='I',1],propGT[pl10=='I',2]),c(propGT[pl10=='C',1],propGT[pl10=='C',2]))
                                                                                                                                                                                   
#end	